{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Extracting a Custom Property"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from chemdataextractor import Document\n",
    "from chemdataextractor.model import Compound\n",
    "from chemdataextractor.doc import Paragraph, Heading"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Example Document"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's create a simple example document with a single heading followed by a single paragraph:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "d = Document(\n",
    "    Heading(u'Synthesis of 2,4,6-trinitrotoluene (3a)'),\n",
    "    Paragraph(u'The procedure was followed to yield a pale yellow solid (b.p. 240 °C)')\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "What does this look like:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div class=\"cde-document\">\n",
       "<h2 class=\"cde-title\">Synthesis of 2,4,6-trinitrotoluene (3a)</h2>\n",
       "<p class=\"cde-paragraph\">The procedure was followed to yield a pale yellow solid (b.p. 240 °C)</p>\n",
       "</div>"
      ],
      "text/plain": [
       "<Document: 2 elements>"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "d"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Default Parsers\n",
    "\n",
    "By default, ChemDataExtractor won't extract the boiling point property:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'labels': ['3a'], 'names': ['2,4,6-trinitrotoluene'], 'roles': ['product']}]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "d.records.serialize()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Defining a New Property Model\n",
    "\n",
    "The first task is to define the schema of a new property, and add it to the `Compound` model:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from chemdataextractor.model import BaseModel, StringType, ListType, ModelType\n",
    "\n",
    "class BoilingPoint(BaseModel):\n",
    "    value = StringType()\n",
    "    units = StringType()\n",
    "    \n",
    "Compound.boiling_points = ListType(ModelType(BoilingPoint))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Writing a New Parser\n",
    "\n",
    "Next, define parsing rules that define how to interpret text and convert it into the model:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import re\n",
    "from chemdataextractor.parse import R, I, W, Optional, merge\n",
    "\n",
    "prefix = (R(u'^b\\.?p\\.?$', re.I) | I(u'boiling') + I(u'point')).hide()\n",
    "units = (W(u'°') + Optional(R(u'^[CFK]\\.?$')))(u'units').add_action(merge)\n",
    "value = R(u'^\\d+(\\.\\d+)?$')(u'value')\n",
    "bp = (prefix + value + units)(u'bp')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from chemdataextractor.parse.base import BaseParser\n",
    "from chemdataextractor.utils import first\n",
    "\n",
    "class BpParser(BaseParser):\n",
    "    root = bp\n",
    "\n",
    "    def interpret(self, result, start, end):\n",
    "        compound = Compound(\n",
    "            boiling_points=[\n",
    "                BoilingPoint(\n",
    "                    value=first(result.xpath('./value/text()')),\n",
    "                    units=first(result.xpath('./units/text()'))\n",
    "                )\n",
    "            ]\n",
    "        )\n",
    "        yield compound\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "Paragraph.parsers = [BpParser()]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Running the New Parser"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'boiling_points': [{'units': '°C', 'value': '240'}],\n",
       "  'labels': ['3a'],\n",
       "  'names': ['2,4,6-trinitrotoluene'],\n",
       "  'roles': ['product']}]"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "d = Document(\n",
    "    Heading(u'Synthesis of 2,4,6-trinitrotoluene (3a)'),\n",
    "    Paragraph(u'The procedure was followed to yield a pale yellow solid (b.p. 240 °C)')\n",
    ")\n",
    "\n",
    "d.records.serialize()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
